#
# Organize mobility regressions
# 

#==============================================
# Preamble
#==============================================

START_TIME = Sys.time()

scripts.dir <- dirname(sys.frame(1)$ofile)
print(scripts.dir)
setwd(scripts.dir)

setwd('..')
in.dir = paste(getwd(), 'data', sep='/')
out.dir = paste(getwd(), 'output', sep='/')
log.dir = paste(getwd(), 'log', sep='/')

library('reshape')
library('dplyr')
library('stringr')

options(stringsAsFactors=FALSE)

#==============================================
# Read in estimates
#==============================================

print(in.dir)
setwd(in.dir)

files.0 = data.frame(f.n=list.files())
files.0$one = 1

files.1 = files.0[grepl('^track_mob_2-[a-z0-9_]{1,20}.csv', files.0$f.n), ]

means.0 = read.csv('track_mob_3-t7t8-means.csv')

data.10 = NULL
obs.10 = NULL

# 
for(i in 1:nrow(files.1)){
  print(in.dir)
  setwd(in.dir)
  fn = files.1$f.n[i]
  data.0 = as.matrix(read.csv(fn))
  data.0[,] = gsub('^=', '', data.0[,])
  data.1 = as.data.frame(t(data.0[2:(nrow(data.0)-4), 2:ncol(data.0)]))
  colnames(data.1) = data.0[2:(nrow(data.0)-4), 1]
  data.1$spec = c(data.0[1, 2:ncol(data.0)])
  
  for(j in 1:nrow(data.1)){
    if(nchar(gsub('\\s+', '', data.1$spec[j])) == 0){
      data.1$spec[j] = data.1$spec[j-1]
    }
  }
  
  data.1$stat = ifelse((1:nrow(data.1) %% 2) == 1, 'b', 'se')
  data.2 = as.data.frame(melt(data.1, id.vars=c('spec', 'stat')))
  names(data.2) = c('spec', 'stat', 'x', 'cell.str')
  data.2$x = as.character(data.2$x)
  
  reg.0 = '([a-z0-9_-]{1,30})::([a-z0-9_-]{1,20})::([a-z0-9_-]{1,20})(::([a-z0-9_-]{1,10})::([a-z0-9_-]{1,10}))?'
  a = str_match(data.2$spec, reg.0)
  data.2$y = a[, 2]
  data.2$target = a[, 3]
  data.2$fe = a[, 4]
  data.2$flex = ifelse(is.na(a[, 6]), 'none', a[, 6])
  data.2$samp = ifelse(is.na(a[, 7]), 'none', a[, 7])
  
  data.3 = data.2
  
  obs.0 = data.frame(obs=as.numeric(data.0[nrow(data.0)-4, 2:ncol(data.0)]),
                     nclust=as.numeric(data.0[nrow(data.0)-1, 2:ncol(data.0)]))
  obs.0$spec = data.1$spec
  obs.1 = obs.0[(1:nrow(data.1) %% 2) == 1, ]
  a = str_match(obs.1$spec, reg.0)
  obs.1$y = a[, 2]
  obs.1$target = a[, 3]
  obs.1$fe = a[, 4]
  obs.1$flex = ifelse(is.na(a[, 6]), 'none', a[, 6])
  obs.1$samp = ifelse(is.na(a[, 7]), 'none', a[, 7])
  
  obs.3 = obs.1
  
  obs.10 = rbind(obs.10, obs.3)
  data.10 = rbind(data.10, data.3)
}

data.10$tag = with(data.10, paste('tag', y, target, fe, flex, samp, spec, x, stat, 'end', sep='.'))
look.0 = data.10[order(data.10$tag), ]
look.1 = look.0[!duplicated(look.0$tag), ]

data.11 = as.data.frame(cast(look.1, y + target + fe + flex + samp + spec + x ~ stat,
                             value='cell.str'))
data.12 = data.11[data.11$x != 'N', ]
data.12$se.0 = with(data.12, as.numeric(gsub('[\\(\\)]', '', se)))
data.12$b.0 = with(data.12, as.numeric(b))

probs.0 = c(0.1, 0.05, 0.01)
probs.1 = abs(qnorm(probs.0 / 2))


#
# pretty coefficient strings
#
data.12$t.0 = with(data.12, abs(b.0 / se.0))
data.12$se.1 = with(data.12, formatC(as.numeric(round(se.0 * 1000) / 1000),
                                     format='f', flag='0', digits=3))
data.12$se.2 = with(data.12, ifelse(is.na(se.0), '',
                                    paste('="(', se.1, ')"', sep='')))
data.12$stars = as.character(cut(data.12$t.0, breaks=c(-Inf, probs.1, Inf),
                               labels=c('', '*', '**', '***')))
data.12$b.1 = with(data.12, formatC(as.numeric(round(b.0 * 1000) / 1000),
                                    format='f', flag='0', digits=3))
data.12$b.2 = with(data.12, ifelse(is.na(se.0), '',
                                   paste('="', b.1, stars, '"', sep='')))

data.13 = data.12[, 1:7]
data.13$b = data.12$b.2
data.13$se = data.12$se.2

means.0$y = gsub('_half$', '', means.0$var_name)
means.0$tag = with(means.0, paste('tag', y, samp, meas, 'end', sep='.'))
rownames(means.0) = means.0$tag
data.13$meas = with(data.13, gsub('_.*$', '', gsub('trk_', '', target)))
data.13$tag = with(data.13, paste('tag', gsub('^mob_', '', as.character(y)), samp, meas, 'end', sep='.'))
data.13$ymean = means.0[data.13$tag, 'pred_']

id.vars.0 = c('y', 'target', 'fe', 'flex', 'samp', 'spec', 'x')
measure.vars.0 = c('b', 'se', 'ymean')
data.14 = as.data.frame(melt(data.13, id.vars=id.vars.0,
                             measure.vars=measure.vars.0))
names(data.14) = c(names(data.14)[1:7], 'stat', 'cell.str')
data.14$pred_pct = gsub('_[a-zA-Z_]{1,20}[0-9]{1,10}$', '', data.14$y)
data.14$y_var = gsub('^mob_[0-9]{2}_', '', data.14$y)
data.14 = data.14 %>%
  mutate(col.prio = case_when(
    fe == 'y' ~ 1,
    fe == 'yd' ~ 2,
    fe == 'none' ~ 3,
    fe == 'flex' ~ 4,
    TRUE ~ 5
  ))
data.15 = data.14[data.14$stat != 'ymean', ]
data.15$tag = with(data.15, paste('tag', y_var, target, pred_pct, x, col.prio, fe, flex, samp, stat, 'end', sep='.'))

data.16 = as.data.frame(cast(data.15, y_var + target + pred_pct + x ~
                               col.prio + fe + flex + samp + stat, value='cell.str'))

#==============================================
# Write long aggregated coefficients table
#==============================================

print(out.dir)
setwd(out.dir)

write.csv(data.16, 'track_mob_4.csv', na='', row.names=F)

reg.0 = 'trk_((unadj)|(rel))_((cy)|(dy))_([dba]m)'
a = str_match(data.14$target, reg.0)
data.14$meas = a[, 2]
data.14$instr.lvl = a[, 5]
data.14$instr.samp = a[, 8]
a = str_match(obs.10$target, reg.0)
obs.10$meas = a[, 2]
obs.10$instr.lvl = a[, 5]
obs.10$instr.samp = a[, 8]

#==============================================
# Results tables
#==============================================

stage2.00 = data.14[with(data.14, grepl('trk_((elem)|(mid))_x$', x) &
                           (fe == 'yc') &
                           (instr.samp != 'bm') &
                           (flex == 'flex') &
                           (!((instr.lvl == 'cy') & (instr.samp == 'am'))) &
                           (!((instr.lvl == 'dy') & (instr.samp == 'dm'))) &
                           (!(((x == 'trk_elem_x') |
                                 (instr.lvl == 'dy')) &
                                (stat == 'ymean')))), ]

y.order.0 = c('scoreresid1'=1, 'scoreresid2'=2, 'scoreresid3'=3, 'scoreresid4'=4,
              'has_year1'=1, 'has_year2'=2, 'has_year3'=3, 'has_year4'=4,
              'clssz_48'=55, 'clssz_45'=6, 'clssz_68'=7,
              'peer_init_48'=58, 'peer_init_45'=9, 'peer_init_68'=10,
              'peer_prev_48'=61, 'peer_prev_45'=12, 'peer_prev_68'=13,
              'sdpeer_init_48'=64, 'sdpeer_init_45'=15, 'sdpeer_init_68'=16,
              'sdpeer_prev_48'=67, 'sdpeer_prev_45'=18, 'sdpeer_prev_68'=19,
              'over8'=20, 'under8'=21)

for(is.score in c(T, F)){
  stage2.0 = stage2.00[grepl('scoreresid[0-9]', stage2.00$y) == is.score, ]
  stage2.0$flex.flag = ifelse(stage2.0$flex == 'flex', 'flex_yes', 'flex_no')
  
  stage2.0$out.name = substr(stage2.0$y, 8, 30)
  stage2.0$pct.in = as.numeric(substr(stage2.0$y, 5, 6))
  x.order.0 = c('trk_elem_x'=1, 'trk_mid_x'=2, 'trk_elem_x2'=3, 'trk_mid_x2'=4)
  x.order.1 = c('b'=1, 'se'=2, 'ymean'=3)
  stage2.0$row.order = (10000 * y.order.0[stage2.0$out.name]) +
    (100 * x.order.0[stage2.0$x]) + x.order.1[stage2.0$stat]
  
  
  # 
  # order the columns properly
  # 
  spec.order.0 = c('unadj'=1, 'rel'=2)
  spec.order.1 = c('dm'=1, 'am'=2, 'bm'=3)
  spec.order.2 = c('cy'=1, 'dy'=2)
  spec.order.3 = c('samp0'=1, 'samp1'=2, 'samp2'=3, 'samp3'=4)
  spec.order.4 = c('mob_25'=1, 'mob_75'=2, 'mob_50'=3)
  
  stage2.0$col.order.num =
    (1000000 * spec.order.4[stage2.0$pred_pct]) +
    (100000000 * spec.order.0[stage2.0$meas]) +
    (10000 * stage2.0$pct.in) +
    (100 * spec.order.1[stage2.0$instr.samp]) +
    (1 * spec.order.2[stage2.0$instr.lvl]) +
    (1000000000 * spec.order.3[stage2.0$samp])
  stage2.0$col.order = paste('c', stage2.0$col.order.num, sep='')
  stage2.1 = as.data.frame(cast(stage2.0, row.order + out.name + x + stat ~
                                  col.order + samp + meas + pct.in +
                                  instr.samp + instr.lvl + fe,
                                value='cell.str'))
  
  
  # include appropriate gaps between rows
  a0 = c(1000, 1, 2, 1000, 3, 4, 1000, 5, 1000)
  a1 = c()
  for(i in 0:20){
    a1 = c(a1, a0 + (5 * i))
  }
  stage2.2 = stage2.1[a1, ]
  
  print(out.dir)
  setwd(out.dir)
  
  a = ifelse(is.score, 'score', 'other')
  write.csv(stage2.2, paste('track_mob_4-', a, '.csv', sep=''),
            na='', row.names=F)
}

#==============================================
# Postamble
#==============================================

END_TIME = Sys.time()
print("Time elapsed in track_mob_4.r:")
print(END_TIME - START_TIME)
